
package edu.utah.seq.parsers;

import java.io.*;
import java.util.regex.*;
import edu.utah.seq.data.*;
import util.gen.*;
import java.util.*;

/**Parses an Eland extended _export.txt or _sorted.txt file into point data, split by chromosome and strand.
 * HWI-EAS240	FC2087UAAXX	8	1	237	408			TACACATGAATTCAACTTAAATTCCTTGTTAAAATT	ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZVUUVVU	chr6.fasta		147077758	F	36	73						Y
 * HWI-EAS240	FC2087UAAXX	8	1	311	379			TACTTGGTTAAGTAGGATGTTATTCTGCTTCTACAC	ZZZZZZZZZZZZZZZZZZZZZZZZVZRZZZUVLLLL	chr3.fasta		154412997	R	36	65						Y
 * HWI-EAS240	FC2087UAAXX	8	1	219	459			TACAACATGTACAAGCCTAAATCCTTTTAGCCAGAG	ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZJUOSOUQ	NM											Y
 * HWI-EAS240	FC2087UAAXX	8	1	582	773			TCATGTTAAAGTGATTGTGATGTTTGAAAACCAATG	ZZZZZZZZZZZZZZZZZZZZZVZZZZJZZZSUSVVG	chr6.fasta		123151134	R	36	54						Y
 * HWI-EAS240	FC2087UAAXX	8	1	820	387			TAAATTGTAACATAAAATTCTTATGAAATTACCTCA	ZKZZZZZZZZZZZZZZZZZZZZZZIZOZZZOODOOH	chr16.fasta		58060756	R	36	39						N  
 * For each sequence a single hit is assigned to the center position of the read.  Files are saved using the bar format.
 * Final positions are in interbase coordinates (0 start, end excluded).
 * @author david.nix@hci.utah.edu 
 **/
public class ElandParser {
	//fields
	private File[] dataFiles;
	private File saveDirectory;
	private File workingFile;
	private String versionedGenome;
	private int shift3Prime = 0;
	private int readLength;
	private HashMap <String, ArrayList<Point>> data = new HashMap <String, ArrayList<Point>> ();
	private int totalNumNoMatch = 0;
	private int totalNumMatch = 0;
	private boolean parseStandAlone = false;
	private float minScore = 13;
	


	//constructors
	public ElandParser(String[] args){
		processArgs(args);
		System.out.println("Adding "+shift3Prime+"bp to the 3' end of each read.");
		System.out.println("Minimum alignment score "+ minScore);
		System.out.println("\nConverting...");

		//for each export file, parse and split by strand and chromosome
		for (int i=0; i< dataFiles.length; i++){
			//set working objects and parse tag file name
			workingFile = dataFiles[i];
			System.out.println("\t"+workingFile);
			//split tag file to chromosome specific temp files in barDirectory
			if (loadWorkingFile() == false) Misc.printExit("\nError: failed to parse, aborting.\n");
		}
		
		//sort, make point data, and save
		System.out.print("Sorting and Saving");
		makePointData();
		System.out.println();
		
		//stats
		System.out.println("Stats...");
		System.out.println("\t"+totalNumNoMatch+"\tNo match, poor quality, or multiple matches");
		System.out.println("\t"+totalNumMatch+"\tMatches");
		
		System.out.println("\nDone!\n");
	}
	
	/**Makes the PointData hash after sorting, writes to disk.*/
	public void makePointData(){
		ComparatorPointPosition comp = new ComparatorPointPosition();
		
		//for each stranded chromosome
		Iterator<String> it = data.keySet().iterator();
		while (it.hasNext()){
			String chromName = it.next();
			//parse strand and chromosome
			int len = chromName.length();
			String strand = chromName.substring(len-1);
			String chromosome = chromName.substring(0, len-2);
			//convert to an array
			ArrayList<Point> al = data.get(chromName);
			Point[] points = new Point[al.size()];
			al.toArray(points);
			//sort
			Arrays.sort(points, comp);			
			//make notes
			HashMap <String,String> notes = new HashMap <String,String> ();
			notes.put(BarParser.GRAPH_TYPE_TAG, BarParser.GRAPH_TYPE_BAR);
			notes.put(BarParser.SOURCE_TAG, IO.concatinateFileFullPathNames(dataFiles, ","));
			notes.put(BarParser.STRAND_TAG, strand);
			notes.put(BarParser.UNIT_TAG, "Eland extended alignment quality score");
			notes.put(BarParser.READ_LENGTH_TAG, readLength+"");
			notes.put(BarParser.BP_3_PRIME_SHIFT, shift3Prime+"");
			notes.put(BarParser.DESCRIPTION_TAG, "Generated by running the ElandParser on Solexa ELAND file(s), the position is assigned to the middle of the read, interbase coordinates");
			//make an Info object  public Info (String name, String versionedGenome, String chromosome, String strand, int readLength, HashMap<String,String> notes){
			Info info = new Info(chromName, versionedGenome, chromosome, strand, readLength, notes);
			//make pd
			PointData pd = Point.extractPositionScores(points);			
			pd.setInfo(info);
			//write to file
			pd.writePointData(saveDirectory);
			//System.out.println("\n**********************");
			//System.out.println(pd.getInfo());
			//System.out.println("\tNum Obs "+pd.getNumberObservations());
			//Misc.printArray(pd.getPositions());
			//Misc.printArray(pd.getScores());
			//cleanup
			pd = null;
			al = null;
			points = null;
			System.out.print(".");
		}
	}


	/**Splits a tag file by chromosome and strand to seperate files.*/
	public boolean loadWorkingFile(){
		try{
			int numMatch = 0;
			int numNoMatch = 0;
			//get reader
			BufferedReader in = IO.fetchBufferedReader(workingFile);
			
			//read in first good line and parse length
			String line = in.readLine();
			ElandExportLine ex;
			if (parseStandAlone) ex = new ElandExportLine(line, shift3Prime, true);
			else  ex = new ElandExportLine(line, shift3Prime, minScore);
			boolean noGood = true;
			if (ex.parsed() == false){
				numNoMatch++;
				while ((line = in.readLine()) !=null){
					if (parseStandAlone) ex = new ElandExportLine(line, shift3Prime, true);
					else  ex = new ElandExportLine(line, shift3Prime,minScore);
					if (ex.parsed()) {
						noGood = false;
						numMatch++;						
						break;
					}
					numNoMatch++;
				}
				if (noGood) {
					System.err.println("\nFailed to find any good reads from the following file, aborting.\n\t"+workingFile);
					return false;
				}
			}
			else numMatch++;
			//parse length
			readLength = ex.fetchReadLength();
			//set in data
			String chromStrandName = ex.fetchChromStrandName();
			ArrayList <Point> al;
			if (data.containsKey(chromStrandName) == false){
				al = new ArrayList <Point> ();
				data.put(chromStrandName, al);
			}
			else al = data.get(chromStrandName);
			al.add(ex.fetchPoint());
			
			//read in remainder
			while ((line = in.readLine()) !=null){
				line = line.trim();
				if (line.length()==0 || line.startsWith("#")) continue;
				if (parseStandAlone) ex = new ElandExportLine(line, shift3Prime, true);
				else  ex = new ElandExportLine(line, shift3Prime, minScore);
				if (ex.parsed()){
					chromStrandName = ex.fetchChromStrandName();
					if (data.containsKey(chromStrandName) == false){
						al = new ArrayList();
						data.put(chromStrandName, al);
					}
					else al = data.get(chromStrandName);
					al.add(ex.fetchPoint());
					numMatch++;
				}
				else {
					numNoMatch++;
				}
			}
			in.close();
			//add to totals
			totalNumMatch += numMatch;
			totalNumNoMatch += numNoMatch;
			//Sum stats
			System.out.println("\t\t"+numMatch+"\t# Match \t"+numNoMatch+"\t# No Match ");
			return true;
		} catch (Exception e){
			e.printStackTrace();
			return false;
		}
	}


	public static void main(String[] args) {
		if (args.length ==0){
			printDocs();
			System.exit(0);
		}
		new ElandParser(args);
	}		

	/**This method will process each argument and assign new varibles*/
	public void processArgs(String[] args){
		Pattern pat = Pattern.compile("-[a-z]");
		File forExtraction = null;
		System.out.println("\nArguments: "+Misc.stringArrayToString(args, " ")+"\n");
		for (int i = 0; i<args.length; i++){
			String lcArg = args[i].toLowerCase();
			Matcher mat = pat.matcher(lcArg);
			if (mat.matches()){
				char test = args[i].charAt(1);
				try{
					switch (test){
					case 'f': forExtraction = new File(args[i+1]); i++; break;
					case 'v': versionedGenome = args[i+1]; i++; break;
					case 's': shift3Prime = Integer.parseInt(args[i+1]); i++; break;
					case 'm': minScore = Float.parseFloat(args[i+1]); i++; break;
					case 'r': saveDirectory = new File (args[i+1]); i++; break;
					case 'p': parseStandAlone = true; break;
					case 'h': printDocs(); System.exit(0);
					default: System.out.println("\nProblem, unknown option! " + mat.group());
					}
				}
				catch (Exception e){
					Misc.printExit("\nSorry, something doesn't look right with this parameter: -"+test+"\n");
				}
			}
		}
		
		//pull files
		File[][] tot = new File[6][];
		tot[0] = IO.extractFiles(forExtraction,"_export.txt");
		tot[1] = IO.extractFiles(forExtraction,"_export.txt.zip");
		tot[2] = IO.extractFiles(forExtraction,"_sorted.txt");
		tot[3] = IO.extractFiles(forExtraction,"_sorted.txt.zip");
		tot[4] = IO.extractFiles(forExtraction,"_export.txt.gz");
		tot[5] = IO.extractFiles(forExtraction,"_sorted.txt.gz");
		dataFiles = IO.collapseFileArray(tot);
		if (dataFiles == null || dataFiles.length==0) dataFiles = IO.extractFiles(forExtraction);
		if (dataFiles == null || dataFiles.length ==0 || dataFiles[0].canRead() == false) Misc.printExit("\nError: cannot find your xxx_export.txt(.zip) or xxx_sorted.txt(.zip) file(s)!\n");
		if (versionedGenome == null) Misc.printExit("\nPlease enter a genome version recognized by UCSC, see http://genome.ucsc.edu/FAQ/FAQreleases.\n");
		if (saveDirectory == null) {
			saveDirectory = new File (dataFiles[0].getParentFile(), "PointData"+shift3Prime+"bp");
			saveDirectory.mkdir();
		}
		else if (saveDirectory.exists() == false) saveDirectory.mkdir();
	}	

	public static void printDocs(){
		System.out.println("\n" +
				"**************************************************************************************\n" +
				"**                               ElandParser: Dec 2008                              **\n" +
				"**************************************************************************************\n" +
				"Splits and converts Eland Extended xxx_export.txt or xxx_sorted.txt files\n" +
				"into center position alignment scored binary xxx.bar files. Coordinates are in\n" +
				"interbase coordiantes (zero based, end excluded). These can be directly viewed in IGB.\n\n" +

				"-v Versioned Genome (ie hg18, dm2, ce2, mm8), see UCSC Browser,\n"+
				"      http://genome.ucsc.edu/FAQ/FAQreleases.\n" +
				"-m Minimum aligment score, Phred scale, defaults to 13. Not used with stand alone.\n"+
				"-f The full path directory/file name of your xxx_export.txt(.zip/.gz) or\n" +
				"      xxx_sorted.txt(.zip/.gz) file(s).\n" +
				"-r Full path directory name for saving the results, defaults to export.txt parent.\n"+
				"-s Shift centered position N bps 3' to accomodate chIP-seq fragment size. Stranded.\n" +
				"      Note, this is far less than 1/2 the expected fragment size, determine best\n" +
				"      value by visual inspection of likely positives. Defaults to 0. If you plan on\n" +
				"      filtering your PointData, don't shift their positions, do it in the filter app.\n"+
				"-p Parse stand alone Eland output file.\n"+


				"\nExample: java -Xmx1500M -jar pathToUSeq/Apps/ElandParser -f /Solexa/Run7/\n" +
				"     -v hg18 -s 38 -r /Solexa/ParsedData/PolIII/\n\n" +

		"**************************************************************************************\n");

	}	

}
